{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import os\n",
    "\n",
    "import shutil\n",
    "from pprint import pprint\n",
    "import logging\n",
    "\n",
    "from ludwig.api import LudwigModel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Receive data for training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the wine data and cast the target column to str\n",
    "# (the config below declares 'quality' as a 'category' output feature)\n",
    "train_df = pd.read_csv('./data/winequalityN.csv').assign(\n",
    "    quality=lambda frame: frame['quality'].map(str)\n",
    ")\n",
    "train_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names with every space swapped for an underscore\n",
    "new_col = [name.replace(' ', '_') for name in train_df.columns]\n",
    "train_df.columns = new_col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.describe().T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['quality'].value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate input features: every column except the target.\n",
    "# Built in DataFrame column order instead of via set(), whose iteration\n",
    "# order for strings can differ between kernel restarts; this keeps the\n",
    "# feature lists (and the config generated from them) reproducible.\n",
    "cols = [c for c in train_df.columns if c != 'quality']\n",
    "features = train_df[cols]\n",
    "\n",
    "# extract categorical features: columns whose dtype is 'object'\n",
    "categorical_features = [p for p in cols if train_df[p].dtype == 'object']\n",
    "\n",
    "print(\"categorical features:\", categorical_features, '\\n')\n",
    "\n",
    "# get numerical features: everything that is not categorical\n",
    "numerical_features = [p for p in cols if p not in categorical_features]\n",
    "\n",
    "print(\"numerical features:\", numerical_features, \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report how many distinct values each categorical feature takes\n",
    "for feature in categorical_features:\n",
    "    n_distinct = train_df[feature].nunique()\n",
    "    print(f\"# of distinct values in categorical feature '{feature}' : {n_distinct}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Ludwig Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ludwig config, written as a single literal\n",
    "config = {\n",
    "    # input features: categorical columns first, then numerical ones\n",
    "    'input_features': [\n",
    "        {'name': column.replace(' ', '_'), 'type': feature_type}\n",
    "        for feature_type, columns in (\n",
    "            ('category', categorical_features),\n",
    "            ('number', numerical_features),\n",
    "        )\n",
    "        for column in columns\n",
    "    ],\n",
    "    # output variable\n",
    "    'output_features': [{'name': 'quality', 'type': 'category'}],\n",
    "    # trainer settings\n",
    "    'trainer': {'epochs': 5},\n",
    "    # defaults, keyed by feature type\n",
    "    'defaults': {\n",
    "        'number': {\n",
    "            'preprocessing': {\n",
    "                'missing_value_strategy': 'fill_with_mean',\n",
    "                'normalization': 'zscore',\n",
    "            },\n",
    "            'encoder': {'type': 'dense', 'num_layers': 2},\n",
    "        },\n",
    "        'category': {\n",
    "            'encoder': {'type': 'sparse'},\n",
    "            'decoder': {'top_k': 2},\n",
    "            'loss': {'confidence_penalty': 0.1},\n",
    "        },\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(config, indent=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize and Train LudwigModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model from the config dict, with the 'local' backend and INFO-level logging\n",
    "model = LudwigModel(\n",
    "    config,\n",
    "    backend='local',\n",
    "    logging_level=logging.INFO,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inspecting Config After Model Initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(model.config['input_features'], indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(model.config['output_features'], indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the experiment on the whole DataFrame; only the first two returned\n",
    "# values are kept (bound to eval_stats and train_stats), the rest are discarded\n",
    "eval_stats, train_stats, _, _ = model.experiment(\n",
    "    dataset=train_df,\n",
    "    experiment_name='wine_quality',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Best-effort cleanup of files left in the working directory\n",
    "# (presumably by the experiment run above).\n",
    "# Each removal is handled on its own: a missing './results' directory no\n",
    "# longer skips the file cleanup, and failures are reported instead of hidden.\n",
    "# NOTE(review): the pattern below deletes every *.json and *.hdf5 file in './',\n",
    "# not only files created by this run -- confirm nothing else lives there.\n",
    "try:\n",
    "    shutil.rmtree('./results')\n",
    "except FileNotFoundError:\n",
    "    pass  # no results directory, nothing to remove\n",
    "except OSError as err:\n",
    "    print(f'could not remove ./results: {err}')\n",
    "\n",
    "for item in os.listdir('./'):\n",
    "    if item.endswith(('.hdf5', '.json')) or item == '.lock_preprocessing':\n",
    "        try:\n",
    "            os.remove(os.path.join('./', item))\n",
    "        except OSError as err:\n",
    "            # report and carry on so one file does not stop the cleanup\n",
    "            print(f'could not remove {item}: {err}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.13 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
